Build a model that will help the marketing team identify potential customers who are relatively more likely to subscribe to a term deposit, and thus increase the hit ratio.
#Import necessary modules & Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pandas_profiling
%matplotlib inline
#Suppress warnings
import warnings
warnings.filterwarnings('ignore')
#styling figures
plt.rc('font',size=14)
sns.set(style='white')
sns.set(style='whitegrid',color_codes=True)
#importing the Encoding library
from sklearn.preprocessing import LabelEncoder
#Import SMOTE for handling class imbalance
from imblearn.over_sampling import SMOTE
#Import Decision Tree Classifier machine learning Library
from sklearn.tree import DecisionTreeClassifier
# Import Logistic Regression machine learning library
from sklearn.linear_model import LogisticRegression
#Import Naive Bayes' machine learning Library
from sklearn.naive_bayes import GaussianNB
#Import sklearn's data-splitting function (random-based split)
from sklearn.model_selection import train_test_split
#Import the metrics
from sklearn import metrics
#Import the Voting classifier for Ensemble
from sklearn.ensemble import VotingClassifier
bankdata_df = pd.read_csv('bank-full.csv')
bankdata_df
bankdata_df.shape
bankdata_df.info()
# summary of data
bankdata_df.isnull().sum()
# no missing values
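# No NaNs, but this dataset encodes missing values as the string 'unknown';
# a quick per-column count (a minimal sketch):
(bankdata_df == 'unknown').sum()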
bankdata_df.dtypes
# type of data, object columns need to be converted into categorical later in this exercise
#cols = ['job', 'marital', 'education', 'default','housing', 'loan', 'contact', 'month', 'poutcome']
#bankdata_df[cols] = bankdata_df[cols].astype('category')
bankdata_df.shape
bankdata_df.describe().T
# we observe skewness in the data
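# Quantifying that skewness per numeric column (a minimal sketch; assumes the numeric dtypes are already correct):
bankdata_df.select_dtypes(include=np.number).skew().sort_values(ascending=False)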
# Analysis for Job data
y=bankdata_df.job
counts=y.value_counts()
percent=y.value_counts(normalize=True).mul(100).round(2)
print(pd.DataFrame({'counts':counts,'percent':percent}))
#distribution of job data
fig, ax=plt.subplots()
fig.set_size_inches(15,10)
#countplot
sns.countplot(x='job',data=bankdata_df,ax=ax)
ax.set_xlabel('Job', fontsize=18)
ax.set_ylabel('Count', fontsize=20)
ax.set_title('Job vs Count', fontsize=20)
ax.tick_params(labelsize=20)
#Marital
y=bankdata_df.marital
counts=y.value_counts()
percent=y.value_counts(normalize=True).mul(100).round(2)
print(pd.DataFrame({'counts':counts,'percent':percent}))
plt.figure(figsize=(5,5))
sns.countplot(x='marital',data=bankdata_df,palette='Set2')
#Education analysis
y=bankdata_df.education
counts=y.value_counts()
percent=y.value_counts(normalize=True).mul(100).round(2)
print(pd.DataFrame({'counts':counts,'percent':percent}))
plt.figure(figsize=(5,5))
sns.countplot(x='education',data=bankdata_df,palette='Set2')
#Default analysis
y=bankdata_df.default
counts=y.value_counts()
percent=y.value_counts(normalize=True).mul(100).round(2)
print(pd.DataFrame({'counts':counts,'percent':percent}))
plt.figure(figsize=(5,5))
sns.countplot(x='default',data=bankdata_df,palette='Set2')
#Housing analysis
y=bankdata_df.housing
counts=y.value_counts()
percent=y.value_counts(normalize=True).mul(100).round(2)
print(pd.DataFrame({'counts':counts,'percent':percent}))
plt.figure(figsize=(5,5))
sns.countplot(x='housing',data=bankdata_df,palette='Set2')
#Loan analysis
y=bankdata_df.loan
counts=y.value_counts()
percent=y.value_counts(normalize=True).mul(100).round(2)
print(pd.DataFrame({'counts':counts,'percent':percent}))
plt.figure(figsize=(5,5))
sns.countplot(x='loan',data=bankdata_df,palette='Set2')
#Contact analysis
y=bankdata_df.contact
counts=y.value_counts()
percent=y.value_counts(normalize=True).mul(100).round(2)
print(pd.DataFrame({'counts':counts,'percent':percent}))
plt.figure(figsize=(5,5))
sns.countplot(x='contact',data=bankdata_df,palette='Set3')
#Month analysis
y=bankdata_df.month
counts=y.value_counts()
percent=y.value_counts(normalize=True).mul(100).round(2)
print(pd.DataFrame({'counts':counts,'percent':percent}))
plt.figure(figsize=(10,5))
sns.countplot(x='month',data=bankdata_df,color='yellow')
#Poutcome analysis
y=bankdata_df.poutcome
counts=y.value_counts()
percent=y.value_counts(normalize=True).mul(100).round(2)
print(pd.DataFrame({'counts':counts,'percent':percent}))
plt.figure(figsize=(5,5))
sns.countplot(x='poutcome',data=bankdata_df,palette='Set1')
#Target analysis
y=bankdata_df.Target
counts=y.value_counts()
percent=y.value_counts(normalize=True).mul(100).round(2)
print(pd.DataFrame({'counts':counts,'percent':percent}))
plt.figure(figsize=(5,5))
sns.countplot(x='Target',data=bankdata_df,palette='Set1')
# the majority of records are 'no' to subscription
# Age distribution & Skewness
sns.distplot(bankdata_df.age)
print("Skewness is: ",bankdata_df['age'].skew())
sns.boxplot(x='age',data=bankdata_df)
# outliers
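# Counting those outliers with the 1.5*IQR whisker rule (a sketch; the 1.5
# multiplier is the standard boxplot convention, not taken from this notebook):
q1, q3 = bankdata_df['age'].quantile([0.25, 0.75])
iqr = q3 - q1
n_out = ((bankdata_df['age'] < q1 - 1.5*iqr) | (bankdata_df['age'] > q3 + 1.5*iqr)).sum()
print('Age outliers by the IQR rule:', n_out)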
# Balance distribution & Skewness
sns.distplot(bankdata_df.balance)
print("Skewness is: ",bankdata_df['balance'].skew())
sns.boxplot(y='balance',data=bankdata_df)
# Outliers
# Day distribution & Skewness
sns.distplot(bankdata_df.day)
print("Skewness is: ",bankdata_df['day'].skew())
#Multimodal distribution
sns.boxplot(x='day',data=bankdata_df)
# Duration distribution & Skewness
sns.distplot(bankdata_df.duration)
print("Skewness is: ",bankdata_df['duration'].skew())
sns.boxplot(y='duration',data=bankdata_df)
#outliers (a huge share of the data falls outside the whiskers)
# Campaign distribution & Skewness
sns.distplot(bankdata_df.campaign)
print("Skewness is: ",bankdata_df['campaign'].skew())
sns.boxplot(y='campaign',data=bankdata_df)
# Pdays distribution & Skewness
sns.distplot(bankdata_df.pdays)
print("Skewness is: ",bankdata_df['pdays'].skew())
sns.boxplot(y='pdays',data=bankdata_df)
# Previous distribution & Skewness
sns.distplot(bankdata_df.previous)
print("Skewness is: ",bankdata_df['previous'].skew())
sns.boxplot(y='previous',data=bankdata_df)
bankdata_df.head()
bankdata_df.describe().T
# mean & median of the numeric columns per Target class (object columns excluded so agg stays valid)
bankdata_df.groupby('Target')[['age','balance','day','duration','campaign','pdays','previous']].agg(['mean','median'])
print(pd.crosstab(bankdata_df['job'],bankdata_df['Target'],normalize='index').mul(100).sort_values(by='yes',ascending=False))
plt.figure(figsize=(15,5))
sns.countplot(x='job',hue='Target',data=bankdata_df)
# 'no' to subscription dominates in every job category
print(pd.crosstab(bankdata_df['marital'],bankdata_df['Target'],normalize='index').mul(100).sort_values(by='yes',ascending=False))
plt.figure(figsize=(15,5))
sns.countplot(x='marital',hue='Target',data=bankdata_df)
print(pd.crosstab(bankdata_df['education'],bankdata_df['Target'],normalize='index').mul(100).sort_values(by='yes',ascending=False))
plt.figure(figsize=(10,5))
sns.countplot(x='education',hue='Target',data=bankdata_df)
print(pd.crosstab(bankdata_df['default'],bankdata_df['Target'],normalize='index').mul(100).sort_values(by='yes',ascending=False))
plt.figure(figsize=(10,5))
sns.countplot(x='default',hue='Target',data=bankdata_df)
print(pd.crosstab(bankdata_df['housing'],bankdata_df['Target'],normalize='index').mul(100).sort_values(by='yes',ascending=False))
plt.figure(figsize=(10,5))
sns.countplot(x='housing',hue='Target',data=bankdata_df)
print(pd.crosstab(bankdata_df['loan'],bankdata_df['Target'],normalize='index').mul(100).sort_values(by='yes',ascending=False))
plt.figure(figsize=(10,5))
sns.countplot(x='loan',hue='Target',data=bankdata_df)
print(pd.crosstab(bankdata_df['contact'],bankdata_df['Target'],normalize='index').mul(100).sort_values(by='yes',ascending=False))
plt.figure(figsize=(10,5))
sns.countplot(x='contact',hue='Target',data=bankdata_df)
print(pd.crosstab(bankdata_df['month'],bankdata_df['Target'],normalize='index').mul(100).sort_values(by='yes',ascending=False))
plt.figure(figsize=(10,5))
sns.countplot(x='month',hue='Target',data=bankdata_df)
print(pd.crosstab(bankdata_df['poutcome'],bankdata_df['Target'],normalize='index').mul(100).sort_values(by='yes',ascending=False))
plt.figure(figsize=(10,5))
sns.countplot(x='poutcome',hue='Target',data=bankdata_df)
fig, ax=plt.subplots(2,2,figsize=(10,6))
#Target vs Age, balance, (categorical vs Continuous)
#boxplot
sns.boxplot(x="Target", y="age", data=bankdata_df,ax=ax[0][0])
#barplot
sns.barplot(x="Target", y="age", data=bankdata_df,ax=ax[0][1])
# Categorical vs Continuous ----Target vs balance
#boxplot
sns.boxplot(x="Target", y="balance", data=bankdata_df,ax=ax[1][0])
#barplot
sns.barplot(x="Target", y="balance", data=bankdata_df,ax=ax[1][1])
plt.subplots_adjust(wspace=0.5)
plt.tight_layout()
bankdata_df['Target']=bankdata_df.Target.astype('category')
bankdata_df['job']=bankdata_df.job.astype('category')
bankdata_df['marital']=bankdata_df.marital.astype('category')
bankdata_df['education']=bankdata_df.education.astype('category')
bankdata_df['default']=bankdata_df.default.astype('category')
bankdata_df['housing']=bankdata_df.housing.astype('category')
bankdata_df['loan']=bankdata_df.loan.astype('category')
bankdata_df['contact']=bankdata_df.contact.astype('category')
bankdata_df['month']=bankdata_df.month.astype('category')
bankdata_df['poutcome']=bankdata_df.poutcome.astype('category')
bankdata_df.dtypes
#Optimize the number of classes in job
#(the raw labels are 'admin.' with a trailing dot and 'entrepreneur')
job_map = {'blue-collar':'worker', 'housemaid':'worker',
           'management':'managerial', 'services':'managerial',
           'admin.':'managerial', 'entrepreneur':'managerial'}
bankdata_df['job'] = bankdata_df['job'].astype(str).replace(job_map)
bankdata_df['job'].unique()
#Encoding of categorical variables
labelencoder_X=LabelEncoder()
bankdata_df['job']=labelencoder_X.fit_transform(bankdata_df['job'])
bankdata_df['marital']=labelencoder_X.fit_transform(bankdata_df['marital'])
bankdata_df['education']=labelencoder_X.fit_transform(bankdata_df['education'])
bankdata_df['default']=labelencoder_X.fit_transform(bankdata_df['default'])
bankdata_df['housing']=labelencoder_X.fit_transform(bankdata_df['housing'])
bankdata_df['loan']=labelencoder_X.fit_transform(bankdata_df['loan'])
bankdata_df['contact']= labelencoder_X.fit_transform(bankdata_df['contact'])
bankdata_df['month']= labelencoder_X.fit_transform(bankdata_df['month'])
bankdata_df['poutcome'] = labelencoder_X.fit_transform(bankdata_df['poutcome'])
bankdata_df['Target'] = labelencoder_X.fit_transform(bankdata_df['Target'])
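# Note: reusing one LabelEncoder refits it on every column, so only the last column's
# label-to-code mapping survives in labelencoder_X.classes_. A hedged sketch keeping one
# encoder per column so every mapping can be inverted later (runs instead of the block
# above; 'encoders' is an illustrative name):
#encoders = {c: LabelEncoder() for c in ['job','marital','education','default','housing','loan','contact','month','poutcome','Target']}
#for c in encoders:
#    bankdata_df[c] = encoders[c].fit_transform(bankdata_df[c])
#encoders['job'].inverse_transform(bankdata_df['job'].head())  # recovers the original labels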
bankdata_df.head()
bankdata_df.describe().T
# Correlation Matrix
cor=bankdata_df.corr()
cor
#Heatmap Plot (correlation)
sns.set(rc={'figure.figsize':(16,12)})
sns.heatmap(bankdata_df.corr(),
annot=True,
linewidths=.5,
center=0,
cbar=False,
cmap="YlGnBu")
plt.show()
sns.pairplot(bankdata_df,hue='Target')
# Model:
# Splitting data
#independent and dependent variables
X=bankdata_df.loc[:,bankdata_df.columns!='Target']
y=bankdata_df.loc[:,bankdata_df.columns=='Target']
# Split X and y into training and test set in 70:30 ratio
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=0)
columns=X_train.columns
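# The split above is not stratified; given the class imbalance, stratifying keeps the
# yes/no ratio identical in train and test. A hedged sketch (the Xs_/ys_ names are
# illustrative and not used below):
Xs_train,Xs_test,ys_train,ys_test=train_test_split(X,y,test_size=0.3,random_state=0,stratify=y)
print(ys_train['Target'].value_counts(normalize=True))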
# invoking the decision tree classifier. Using the 'entropy' criterion to find the split columns; the other option is 'gini'.
dt_model = DecisionTreeClassifier(criterion = 'entropy',random_state=100 )
#Fitting the model
dt_model.fit(X_train, y_train)
from sklearn.tree import export_graphviz
from six import StringIO
from IPython.display import Image
import pydotplus
import graphviz
xvar = bankdata_df.drop('Target', axis=1)
feature_cols = xvar.columns
dot_data = StringIO()
export_graphviz(dt_model, out_file=dot_data,
filled=True, rounded=True,
special_characters=True,feature_names = feature_cols,class_names=['0','1'])
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('bankdata_df.png')
Image(graph.create_png())
# Complex tree - probably needs pruning
print (pd.DataFrame(dt_model.feature_importances_, columns = ["Imp"], index = X_train.columns))
# the higher the importance, the more the feature impacts Target: month, duration & balance stand out
# prediction
y_predict = dt_model.predict(X_test)
# performance on train data
print('Performance on Training data using DT: ',dt_model.score(X_train , y_train))
# performance on test data
print('Performance on Testing data using DT: ',dt_model.score(X_test , y_test))
# 100% accuracy on training indicates overfitting, while accuracy dropped on testing.
# accuracy, confusion matrix, and classification report
acc_DT=metrics.accuracy_score(y_test, y_predict)
print('Accuracy DT: ',acc_DT)
print('Confusion Matrix DT: \n',metrics.confusion_matrix(y_test, y_predict))
print('Classification report DT: \n',metrics.classification_report(y_test, y_predict))
#the decision tree gives ~87% accuracy, i.e. ~13% error
#ROC Curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
# use predicted probabilities, not hard labels, for a proper AUC
dt_roc_auc = roc_auc_score(y_test, dt_model.predict_proba(X_test)[:,1])
fpr, tpr, thresholds = roc_curve(y_test, dt_model.predict_proba(X_test)[:,1])
plt.figure()
plt.plot(fpr, tpr, label='Decision Tree (area = %0.2f)' % dt_roc_auc)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.show()
#Store the accuracy results for each model in a dataframe for final comparison
results = pd.DataFrame({'Method':['Decision Tree'], 'accuracy': [acc_DT]},index=['1'])
results = results[['Method', 'accuracy']]
results
# Regularizing - Decision tree classifier & fitting the model
reg_dt_model = DecisionTreeClassifier(criterion = 'entropy', max_depth = 9,random_state=100,min_samples_leaf=5)
reg_dt_model.fit(X_train, y_train)
print (pd.DataFrame(reg_dt_model.feature_importances_, columns = ["Imp"], index = X_train.columns))
# same as pre pruning: month, duration and balance impact Target more
from sklearn.tree import export_graphviz
from six import StringIO
from IPython.display import Image
import pydotplus
import graphviz
xvar = bankdata_df.drop('Target', axis=1)
feature_cols = xvar.columns
dot_data = StringIO()
export_graphviz(reg_dt_model, out_file=dot_data,
filled=True, rounded=True,
special_characters=True,feature_names = feature_cols,class_names=['0','1'])
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('bankdata_df_pruned.png')
Image(graph.create_png())
y_predict = reg_dt_model.predict(X_test)
# performance on train data
print('Performance on Training data using Pruned DT: ',reg_dt_model.score(X_train , y_train))
# performance on test data
print('Performance on Testing data using Pruned DT: ',reg_dt_model.score(X_test , y_test))
#Training & testing results are very close - the pruned DT model generalizes well
#Evaluate the model using accuracy, confusion matrix, and classification report
acc_pruned_DT=metrics.accuracy_score(y_test, y_predict)
print("Accuracy Pruned DT:",acc_pruned_DT)
print('Confusion Matrix Pruned DT: \n',metrics.confusion_matrix(y_test, y_predict))
print('Classification report Pruned DT: \n',metrics.classification_report(y_test, y_predict))
#Regularized decision tree - accuracy is ~90%, higher than the previous model.
#Store the accuracy results for each model in a dataframe for final comparison
tempResultsDf = pd.DataFrame({'Method':['Pruned Decision Tree'], 'accuracy': [acc_pruned_DT]},index={'2'})
results = pd.concat([results, tempResultsDf])
results = results[['Method', 'accuracy']]
results
# Increasing max_depth from 5 to 9 increases the pruned tree's accuracy
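# Rather than trying depths by hand, a small grid search can pick max_depth; a sketch
# (the candidate depths and cv=5 are assumptions for illustration):
from sklearn.model_selection import GridSearchCV
depth_grid = GridSearchCV(DecisionTreeClassifier(criterion='entropy', random_state=100),
                          param_grid={'max_depth': [3, 5, 7, 9, 11]}, cv=5, scoring='accuracy')
depth_grid.fit(X_train, y_train)
print('Best params:', depth_grid.best_params_, ' CV accuracy:', depth_grid.best_score_)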
# Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
rfcl = RandomForestClassifier(n_estimators = 50,random_state=100)
rfcl = rfcl.fit(X_train, y_train)
y_predict = rfcl.predict(X_test)
acc_RF=metrics.accuracy_score(y_test , y_predict)
print('Accuracy using Random forest: ',acc_RF)
print('Confusion Matrix Random Forest: \n',metrics.confusion_matrix(y_test, y_predict))
tempResultsDf = pd.DataFrame({'Method':['Random Forest'], 'accuracy': [acc_RF]},index={'3'})
results = pd.concat([results, tempResultsDf])
results = results[['Method', 'accuracy']]
results
# Bagging
from sklearn.ensemble import BaggingClassifier
bgcl = BaggingClassifier(base_estimator=dt_model, n_estimators=100,random_state=100)
bgcl = bgcl.fit(X_train, y_train)
y_predict = bgcl.predict(X_test)
acc_bg=metrics.accuracy_score(y_test, y_predict)
print('Accuracy using Bagging: ',acc_bg)
print('Confusion Matrix using Bagging: \n',metrics.confusion_matrix(y_test, y_predict))
tempResultsDf = pd.DataFrame({'Method':['Bagging'], 'accuracy': [acc_bg]},index={'4'})
results = pd.concat([results, tempResultsDf])
results = results[['Method', 'accuracy']]
results
from sklearn.ensemble import AdaBoostClassifier
abcl = AdaBoostClassifier(base_estimator=dt_model, n_estimators=100,random_state=100)
abcl = abcl.fit(X_train, y_train)
y_predict = abcl.predict(X_test)
acc_AB=metrics.accuracy_score(y_test, y_predict)
print('Accuracy using AdaBoosting: ',acc_AB)
print('Confusion Matrix: \n',metrics.confusion_matrix(y_test, y_predict))
tempResultsDf = pd.DataFrame({'Method':['AdaBoosting'], 'accuracy': [acc_AB]},index={'5'})
results = pd.concat([results, tempResultsDf])
results = results[['Method', 'accuracy']]
results
# With AdaBoost, accuracy dropped slightly
from sklearn.ensemble import GradientBoostingClassifier
gbcl = GradientBoostingClassifier(n_estimators = 50,random_state=100)
gbcl = gbcl.fit(X_train, y_train)
y_predict = gbcl.predict(X_test)
acc_GB=metrics.accuracy_score(y_test, y_predict)
print('Accuracy using GradientBoosting: ',acc_GB)
print('Confusion Matrix using GradientBoosting: \n',metrics.confusion_matrix(y_test, y_predict))
tempResultsDf = pd.DataFrame({'Method':['GradientBoosting'], 'accuracy': [acc_GB]},index={'6'})
results = pd.concat([results, tempResultsDf])
results = results[['Method', 'accuracy']]
results
#Gradient Boosting accuracy is better than AdaBoost & close to the pruned model.
# Ensemble: Logistic Regression, Naive Bayes & SVM (Decision Tree left out)
#Multiple model Ensemble
from sklearn import svm
LR=LogisticRegression()
NB=GaussianNB()
#DT=DecisionTreeClassifier() ('DT',DT),
SVM=svm.SVC()
evc=VotingClassifier(estimators=[('LR',LR),('NB',NB),('SVM',SVM)],voting='hard')
evc.fit(X_train,y_train)
y_predict = evc.predict(X_test)
#evc.score(X_test,y_test)
acc_Ensemble=metrics.accuracy_score(y_test, y_predict)
print('Accuracy using Ensemble: ',acc_Ensemble)
print('Confusion Matrix: \n',metrics.confusion_matrix(y_test, y_predict))
tempResultsDf = pd.DataFrame({'Method':['Ensemble'], 'accuracy': [acc_Ensemble]},index={'7'})
results = pd.concat([results, tempResultsDf])
results = results[['Method', 'accuracy']]
results
# Not much improvement
# Random Forest, Bagging, Gradient Boosting & the pruned decision tree give the highest accuracy, though the others are not far off. We still need to see
# the results after handling the class imbalance and normalizing the columns, as the data is heavily skewed towards customers who do not subscribe to term deposits.
#independent and dependent variables
bankdata_scale=bankdata_df.copy()
X=bankdata_scale.loc[:,bankdata_scale.columns!='Target']
y=bankdata_scale.loc[:,bankdata_scale.columns=='Target']
# Split X and y into training and test set in 70:30 ratio
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=0)
columns=X_train.columns
#for normalization
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
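# StandardScaler returns plain numpy arrays; wrapping them back into DataFrames keeps the
# column names for inspection (optional - the models below accept either form):
X_train = pd.DataFrame(X_train, columns=columns)
X_test = pd.DataFrame(X_test, columns=columns)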
bankdata_scale.groupby('Target').size()
#Import the library for handling the imbalanced dataset
from imblearn.over_sampling import SMOTE
Ov_sampling=SMOTE(random_state=100)
# now use SMOTE to oversample the training data (X_train / y_train)
# (fit_resample is the current imblearn name for the old fit_sample)
ov_data_X,ov_data_y=Ov_sampling.fit_resample(X_train,y_train)
ov_data_X=pd.DataFrame(data=ov_data_X,columns=columns)
ov_data_y=pd.DataFrame(ov_data_y,columns=['Target'])
print('length of oversampled data ',len(ov_data_X))
print('Number of no subscription in oversampled data ' ,len(ov_data_y[ov_data_y['Target']==0]))
print('Number of subscription ' ,len(ov_data_y[ov_data_y['Target']==1]))
print('Proportion of no subscription data in oversampled data ' ,len(ov_data_y[ov_data_y['Target']==0])/len(ov_data_X))
print('Proportion of subscription data in oversampled data ' ,len(ov_data_y[ov_data_y['Target']==1])/len(ov_data_X))
ov_data_y['Target'].value_counts()
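# Sanity check: SMOTE was applied to the training split only (the correct place);
# the test set keeps its natural class ratio (a minimal sketch):
print('Train size after SMOTE:', len(ov_data_X), ' Test size (untouched):', len(X_test))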
# invoking the decision tree classifier. Using the 'entropy' criterion to find the split columns; the other option is 'gini'.
dt_model = DecisionTreeClassifier(criterion = 'entropy',random_state=100 )
#Fitting the model
dt_model.fit(ov_data_X,ov_data_y)
#Making the prediction
y_predict = dt_model.predict(X_test)
#Evaluate the model using accuracy, confusion matrix, and classification report
acc_DT=metrics.accuracy_score(y_test, y_predict)
print('Accuracy DT: ',acc_DT)
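# Accuracy alone can hide weak minority-class performance after oversampling; a quick
# per-class check mirroring the earlier sections:
print('Classification report DT (after SMOTE): \n',metrics.classification_report(y_test, y_predict))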
#Store the accuracy results for each model in a dataframe for final comparison
results = pd.DataFrame({'Method':['Decision Tree'], 'accuracy': [acc_DT]},index=['1'])
results = results[['Method', 'accuracy']]
results
# Regularizing the Decision tree classifier and fitting the model
reg_dt_model = DecisionTreeClassifier(criterion = 'entropy', max_depth = 9,random_state=100,min_samples_leaf=5)
reg_dt_model.fit(ov_data_X,ov_data_y)
y_predict = reg_dt_model.predict(X_test)
#Evaluate the model using accuracy, confusion matrix, and classification report
acc_pruned_DT=metrics.accuracy_score(y_test, y_predict)
print("Accuracy Pruned DT:",acc_pruned_DT)
#Store the accuracy results for each model in a dataframe for final comparison
tempResultsDf = pd.DataFrame({'Method':['Pruned Decision Tree'], 'accuracy': [acc_pruned_DT]},index={'2'})
results = pd.concat([results, tempResultsDf])
results = results[['Method', 'accuracy']]
results
# accuracy even dropped for the pruned tree
from sklearn.ensemble import RandomForestClassifier
rfcl = RandomForestClassifier(n_estimators = 50,random_state=100)
rfcl = rfcl.fit(ov_data_X,ov_data_y)
y_predict = rfcl.predict(X_test)
acc_RF=metrics.accuracy_score(y_test , y_predict)
print('Accuracy using Random forest: ',acc_RF)
tempResultsDf = pd.DataFrame({'Method':['Random Forest'], 'accuracy': [acc_RF]},index={'3'})
results = pd.concat([results, tempResultsDf])
results = results[['Method', 'accuracy']]
results
# much better accuracy with Random Forest
from sklearn.ensemble import BaggingClassifier
bgcl = BaggingClassifier(base_estimator=dt_model, n_estimators=100,random_state=100)
bgcl = bgcl.fit(ov_data_X,ov_data_y)
y_predict = bgcl.predict(X_test)
acc_bg=metrics.accuracy_score(y_test, y_predict)
print('Accuracy using Bagging: ',acc_bg)
tempResultsDf = pd.DataFrame({'Method':['Bagging'], 'accuracy': [acc_bg]},index={'4'})
results = pd.concat([results, tempResultsDf])
results = results[['Method', 'accuracy']]
results
from sklearn.ensemble import AdaBoostClassifier
abcl = AdaBoostClassifier(base_estimator=dt_model, n_estimators=100,random_state=100)
abcl = abcl.fit(ov_data_X,ov_data_y)
y_predict = abcl.predict(X_test)
acc_AB=metrics.accuracy_score(y_test, y_predict)
print('Accuracy using AdaBoosting: ',acc_AB)
tempResultsDf = pd.DataFrame({'Method':['AdaBoosting'], 'accuracy': [acc_AB]},index={'5'})
results = pd.concat([results, tempResultsDf])
results = results[['Method', 'accuracy']]
results
# accuracy dropped as compared to the previous models
from sklearn.ensemble import GradientBoostingClassifier
gbcl = GradientBoostingClassifier(n_estimators = 50,random_state=100)
gbcl = gbcl.fit(ov_data_X,ov_data_y)
y_predict = gbcl.predict(X_test)
acc_GB=metrics.accuracy_score(y_test, y_predict)
print('Accuracy using GradientBoosting: ',acc_GB)
tempResultsDf = pd.DataFrame({'Method':['GradientBoosting'], 'accuracy': [acc_GB]},index={'6'})
results = pd.concat([results, tempResultsDf])
results = results[['Method', 'accuracy']]
results
# accuracy dropped further
#Multiple model Ensemble
from sklearn import svm
LR=LogisticRegression()
NB=GaussianNB()
#DT=DecisionTreeClassifier() ('DT',DT),
SVM=svm.SVC()
evc=VotingClassifier(estimators=[('LR',LR),('NB',NB),('SVM',SVM)],voting='hard')
evc.fit(ov_data_X,ov_data_y)
y_predict = evc.predict(X_test)
acc_Ensemble=metrics.accuracy_score(y_test, y_predict)
print('Accuracy using Ensemble: ',acc_Ensemble)
tempResultsDf = pd.DataFrame({'Method':['Ensemble'], 'accuracy': [acc_Ensemble]},index={'7'})
results = pd.concat([results, tempResultsDf])
results = results[['Method', 'accuracy']]
results
With scaling the columns & managing the imbalance in the target column, Bagging & Random Forest have the highest accuracy. Compared with the unscaled results, the difference between Bagging & Random Forest is relatively larger.
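# For a final side-by-side view, the comparison table can be sorted (a one-line sketch):
results.sort_values(by='accuracy', ascending=False)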